/* 
  Code: two_stage example.sas
  Author: Paul von Hippel
  Version: 1.0
  Date: July 19, 2017
*/
/* In this code, which parallels Section 2 in my article "How many imputations do you need?", 
   I show how to automate a two-stage procedure 
    for choosing an appropriate number of imputations. */
/* I first show the basic steps and then show how they can be streamlined with macros. */

/* Before anything else, copy the provided data, bmi_observed, into the work directory. */
/* To do this, run the code below. 
    You will need to change the libname statement to point to the 
    folder on your system that contains the data. */
/* Assign the libref REPDATA to the folder that holds the supplied data.
   Edit this path to match your own system. */
libname repdata "C:\Users\pvonhippel\Box Sync\More imputations\Supplement\";
/* Copy repdata.bmi_observed to a one-level dataset name, so that later steps
   can refer to it simply as bmi_observed. */
data bmi_observed;
 set repdata.bmi_observed;
run;

/* Now use the SURVEYMEANS procedure in SAS to get a point estimate and SE 
    using listwise deletion. 
   This isn't necessary to illustrate MI but it provides a point of comparison. 
   The estimate adjusts for features of the complex random sample, 
    including sampling weights, sample clusters, and sample strata.
*/
/* Listwise-deletion benchmark: mean and SE of c3bmi from bmi_observed,
   with plots suppressed. */
/* NOTE(review): no WEIGHT, STRATA, or CLUSTER statement is given here, so as
    written the procedure is handed no design variables to adjust for --
    confirm whether they should be added to match the description above. */
proc surveymeans data=bmi_observed plots=none;
 var c3bmi;
run;


/******************* BASIC TWO-STAGE APPROACH *************************************/
/* To use the two-stage approach, start with a pilot using M=5 imputations.
    To do this, use SAS's MI procedure to impute missing BMIs 
    under a multivariate normal model for the BMIs in rounds 1-4. */
/* Pilot imputation: write 5 imputed copies of bmi_observed to mi_data,
   using the four BMI variables in the imputation model. */
proc mi data=bmi_observed
        out=mi_data
        nimpute=5;
  var c1bmi c2bmi c3bmi c4bmi;
run;
/* 
   The output from the MI procedure is a new dataset, called mi_data, 
    which contains M=5 imputed copies of the original dataset, 
    stacked and indexed by a new variable called _imputation_. 
    You can inspect it, for example, by opening mi_data in the SAS Explorer window. 
*/
/* Now use the SURVEYMEANS procedure again to analyze each of the 
    M=5 imputed datasets as though it were complete.
*/
/* Analyze each imputed copy separately (one BY group per _imputation_ value);
   ODS saves the Statistics table to the dataset ests. */
proc surveymeans data=mi_data plots=none;
  ods output statistics=ests;
  by _imputation_;
  var c3bmi;
run;
/* The output is a new SAS dataset (called ests), 
    which contains M=5 mean estimates (called mean) and SE estimates (called stderr) 
    obtained from the M=5 imputed copies of the dataset. 
    The estimates are stacked and labeled by the variables VarName and VarLabel.
    You can inspect the dataset, for example, by opening ests in the 
     SAS Explorer window. 
*/
/* Next, combine these M=5 estimates to produce 
    a single MI point estimate and SE estimate. 
   You could do this using the MIANALYZE procedure that is built into SAS, 
    but use instead the macro %mi_combine, also provided,
    whose features support the two-stage approach.
   Below you will need to change the %include statement to point to the
    path that contains the %mi_combine macro on your system. 
*/
/* Load the %mi_combine macro definition; edit this path for your system. */
%include "C:\Users\pvonhippel\Box Sync\More imputations\Supplement\mi_combine.sas";
/* Combine the per-imputation estimates in ests and write the result to mi_ests.
   est= and se= name the point-estimate and SE columns of ests,
   label= passes the labeling columns VarName and VarLabel,
   and target_sd_se= sets the target for SD(SE_MI | Y_obs). */
%mi_combine (inests=ests, outests=mi_ests,
    est=mean, se=stderr, label=VarName VarLabel, target_sd_se=.001);
/* The macro takes an argument target_sd_se,
	which specifies a target value for SD(SE_MI | Y_obs), 
    which the macro divides by the estimate SE_MI 
	to get a target value for CV(SE_MI | Y_obs). */
/* The macro also takes the argument inests, 
	which names the dataset containing the M estimates, 
	as well as the arguments est and se, 
	which name the columns of inests 
	that contain the M point estimates and SE estimates. */
/* The output of %mi_combine is a dataset named by the outests argument 
	(the name in our example is mi_ests), which contains 
	 an MI point estimate and SE estimate, 
	 a point estimate and 95% CI for the fraction of missing information,
	 and a recommendation for the number of imputations to use in the final analysis. 
	 The recommended number of imputations is calculated using a formula in my paper 
	  "How many imputations do you need?"
	 The recommended number appears in the output 
	  and is also copied to a global macro variable called &recommended_imputations. 
*/

/*	To get final estimates using the recommended number of imputations, 
	 simply rerun the MI procedure 
	 using &recommended_imputations instead of 5.
*/
/* Final imputation: same variables as the pilot, with the number of
   imputations taken from the recommended_imputations macro variable.
   The output replaces the pilot's mi_data. */
proc mi data=bmi_observed
        out=mi_data
        nimpute=&recommended_imputations;
  var c1bmi c2bmi c3bmi c4bmi;
  /* The recommended number of imputations may be large, so shorten the
     MCMC chains (fewer burn-in and between-imputation iterations)
     to keep runtime down. */
  mcmc niter=5 nbiter=10;
run;
/* Finally, obtain and combine the &recommended_imputations estimates 
    using the SURVEYMEANS procedure 
    and %mi_combine macro, as before.*/
/* Analyze each imputed copy in mi_data and save the Statistics table to ests. */
proc surveymeans data=mi_data plots=none;
  ods output statistics=ests;
  by _imputation_;
  var c3bmi;
run;
/* Combine the per-imputation estimates into mi_ests. */
%mi_combine (inests=ests, outests=mi_ests, est=mean, se=stderr,
    label=VarName VarLabel, target_sd_se=.001);


/************ STREAMLINED  APPROACH *****************/
/* To streamline the two-stage approach, 
	you can wrap the three steps of MI -- imputation, analysis, and combination --
	in a single macro, called %mi_steps, 
	whose arguments let the user specify the number of imputations 
	and the target for SD(se). */
/* %mi_steps: one complete MI pass -- impute, analyze, combine.
     IMPUTATIONS=   number of imputations requested from PROC MI
     TARGET_SD_SE=  target for SD(SE), forwarded to %mi_combine
     indata=        dataset to impute
   Always writes to the fixed dataset names mi_data, ests, and mi_ests,
   so each call replaces the results of the previous one. */
%macro mi_steps (IMPUTATIONS=, TARGET_SD_SE=, indata=);
  /* Step 1: imputation, with shortened MCMC chains. */
  proc mi data=&indata out=mi_data nimpute=&imputations;
    var c1bmi c2bmi c3bmi c4bmi;
    mcmc niter=5 nbiter=10;
  run;

  /* Step 2: analysis of each imputed copy; Statistics table saved to ests. */
  proc surveymeans data=mi_data plots=none;
    ods output statistics=ests;
    by _imputation_;
    var c3bmi;
  run;

  /* Step 3: combination of the per-imputation estimates into mi_ests. */
  %mi_combine (inests=ests, outests=mi_ests, est=mean, se=stderr,
      label=varname varlabel, target_sd_se=&target_sd_se);
%mend mi_steps;
/* Then to implement the two-stage approach, 
    you just call the macro twice --
    first with M=5 pilot imputations, 
    and then with the &recommended_imputations that emerged from the pilot.
*/
/* Stage 1: pilot run with 5 imputations. */
%mi_steps (IMPUTATIONS=5, TARGET_SD_SE=.001, indata=bmi_observed);
/* Stage 2: rerun with the number of imputations recommended by the pilot.
   This call replaces the pilot's mi_data, ests, and mi_ests datasets. */
%mi_steps (IMPUTATIONS=&RECOMMENDED_IMPUTATIONs, TARGET_SD_SE=.001, indata=bmi_observed);


/************ EVEN MORE STREAMLINED  APPROACH *****************/
/* To streamline further, you can wrap both stages in another macro, 
    whose arguments let the user specify the target for SD(SE)
    and the number of imputations to use in the pilot analysis.*/
/* %two_stage: run the pilot stage and the final stage back to back.
     PILOT_IMPUTATIONS=  number of imputations for the pilot stage
     TARGET_SD_SE=       target for SD(SE), passed to BOTH stages
                         (defaults to .001 when omitted)
     indata=             dataset to impute
   The final stage uses the global macro variable recommended_imputations,
   which %mi_combine sets during the pilot stage. */
%macro two_stage (PILOT_IMPUTATIONS=, TARGET_SD_SE=.001, indata=);
 /* Stage 1: pilot. Forward the caller's target rather than a fixed value. */
 %mi_steps (IMPUTATIONS=&pilot_imputations, TARGET_SD_SE=&target_sd_se, indata=&indata);
 /* Stage 2: final run with the recommended number of imputations. */
 %mi_steps (IMPUTATIONS=&recommended_imputations, TARGET_SD_SE=&target_sd_se, indata=&indata);
%mend two_stage;
/* Then you can invoke the two-stage approach in a single line
    -- e.g., with 5 pilot imputations: */
%two_stage (PILOT_IMPUTATIONS=5, TARGET_SD_SE=.001, indata=bmi_observed);
/* or with 20 pilot imputations.
   Each call writes to the same mi_data, ests, and mi_ests datasets,
   so this second call replaces the results of the first. */
%two_stage (PILOT_IMPUTATIONS=20, TARGET_SD_SE=.001, indata=bmi_observed);
